In [1]:
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import os
In [2]:
# Sigmoid or logistic function
# For any input x, the output is bounded between 0 and 1.
def sigmoid_func(x):
    return 1.0 / (1 + math.exp(-x))
In [3]:
sigmoid_func(10)
Out[3]:
In [4]:
sigmoid_func(-100)
Out[4]:
In [5]:
sigmoid_func(0)
Out[5]:
In [6]:
# Sigmoid function example
x = pd.Series(np.arange(-8, 8, 0.5))
y = x.map(sigmoid_func)
In [7]:
x.head()
Out[7]:
In [8]:
fig = plt.figure(figsize = (12, 8))
plt.plot(x,y)
plt.ylim((-0.2, 1.2))
plt.xlabel('input')
plt.ylabel('sigmoid output')
plt.grid(True)
plt.axvline(x = 0, ymin = 0, ymax = 1, ls = 'dashed')
plt.axhline(y = 0.5, xmin = 0, xmax = 10, ls = 'dashed')
plt.axhline(y = 1.0, xmin = 0, xmax = 10, color = 'r')
plt.axhline(y = 0.0, xmin = 0, xmax = 10, color = 'r')
plt.title('Sigmoid')
Out[8]:
Example Dataset - Hours spent and Exam Results: https://en.wikipedia.org/wiki/Logistic_regression
The sigmoid function produces an output between 0 and 1 for any input. An input of 0 produces an output of exactly 0.5; negative inputs produce values below 0.5, while positive inputs produce values above 0.5.
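As a side note (my addition, not part of the original notebook), the same transformation can be vectorized with NumPy so an entire array or Series is mapped at once:

# Illustrative sketch: a vectorized sigmoid using np.exp, which works elementwise
def sigmoid_vec(values):
    return 1.0 / (1.0 + np.exp(-np.asarray(values, dtype = float)))

# sigmoid_vec(np.arange(-8, 8, 0.5)) reproduces the curve plotted above without .map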
In [9]:
data_path = r'..\Data\ClassExamples\HoursExam\HoursExamResult.csv'
In [10]:
df = pd.read_csv(data_path)
Input Feature: Hours
Output: Pass (1 = pass, 0 = fail)
In [11]:
df.head()
Out[11]:
In [12]:
# optimal weights given in the wiki dataset
def straight_line(x):
    return 1.5046 * x - 4.0777
In [13]:
# How does weight affect outcome
def straight_line_weight(weight1, x):
    return weight1 * x - 4.0777
In [14]:
# Generate probabilities by running the feature through the linear model and then through the sigmoid function
y_vals = df.Hours.map(straight_line).map(sigmoid_func)
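As a quick sanity check (my addition, not an original cell): the Wikipedia line crosses zero at about 2.71 hours, so the predicted probability there should be very close to 0.5.

# Sanity check: at ~2.71 hours the linear score is ~0, so the sigmoid output is ~0.5
print(sigmoid_func(straight_line(2.71)))   # ~0.49994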
In [15]:
fig = plt.figure(figsize = (12, 8))
plt.scatter(x = df.Hours,
y = y_vals,
color = 'b',
label = 'logistic')
plt.scatter(x = df[df.Pass == 1].Hours,
y = df[df.Pass == 1].Pass,
color = 'g',
label = 'pass')
plt.scatter(x = df[df.Pass == 0].Hours,
y = df[df.Pass == 0].Pass,
color = 'r',
label = 'fail')
plt.title('Hours Spent Reading - Pass Probability')
plt.xlabel('Hours')
plt.ylabel('Pass Probability')
plt.grid(True)
plt.xlim((0,7))
plt.ylim((-0.2,1.5))
plt.axvline(x = 2.75,
ymin = 0,
ymax=1)
plt.axhline(y = 0.5,
xmin = 0,
xmax = 6,
label = 'cutoff at 0.5',
ls = 'dashed')
plt.axvline(x = 2,
ymin = 0,
ymax = 1)
plt.axhline(y = 0.3,
xmin = 0,
xmax = 6,
label = 'cutoff at 0.3',
ls = 'dashed')
plt.axvline(x = 3,
ymin = 0,
ymax=1)
plt.axhline(y = 0.6,
xmin = 0,
xmax = 6,
label='cutoff at 0.6',
ls = 'dashed')
plt.legend()
Out[15]:
At about 2.7 hours of study time, the predicted probability reaches 0.5, so any student who spent roughly 2.7 hours or more has a better-than-even probability of passing the exam.
The cutoff can be adjusted: instead of 0.5 it could be set at, say, 0.4 or 0.6, depending on the nature of the problem and the impact of misclassification. A quick sketch of how the cutoff changes the predicted labels follows below.
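As a rough illustration (my addition, not an original cell), the effect of different cutoffs can be checked directly against the labels, reusing y_vals and df from the cells above:

# Illustrative sketch: how the cutoff changes the predicted labels and accuracy
for cutoff in [0.3, 0.5, 0.6]:
    predicted_pass = (y_vals >= cutoff).astype(int)
    accuracy = (predicted_pass == df.Pass).mean()
    print('cutoff={0:.1f}  predicted passes={1}  accuracy={2:.2f}'.format(
        cutoff, predicted_pass.sum(), accuracy))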
In [16]:
weights = [0, 1, 2]
y_at_weight = {}
for w in weights:
    y_calculated = []
    y_at_weight[w] = y_calculated
    for x in df.Hours:
        y_calculated.append(sigmoid_func(straight_line_weight(w, x)))
In [17]:
# Note: y_vals (from In [14]) already holds sigmoid outputs, so this extra mapping is redundant and y_sig_vals is not used below
y_sig_vals = y_vals.map(sigmoid_func)
In [18]:
fig = plt.figure(figsize = (12, 8))
plt.scatter(x = df.Hours,
y = y_vals,
color = 'b',
label = 'logistic curve')
plt.scatter(x = df[df.Pass==1].Hours,
y = df[df.Pass==1].Pass,
color = 'g',
label = 'pass')
plt.scatter(x = df[df.Pass==0].Hours,
y = df[df.Pass==0].Pass,
color = 'r',
label = 'fail')
plt.scatter(x = df.Hours,
y = y_at_weight[0],
color = 'k',
label = 'at wt 0')
plt.scatter(x = df.Hours,
y = y_at_weight[1],
color = 'm',
label = 'at wt 1')
plt.scatter(x = df.Hours,
y = y_at_weight[2],
color = 'y',
label = 'at wt 2')
plt.xlim((0,8))
plt.ylim((-0.2, 1.5))
plt.axhline(y = 0.5,
xmin = 0,
xmax = 6,
color = 'b',
ls = 'dashed')
plt.axvline(x = 4,
ymin = 0,
ymax = 1,
color = 'm',
ls = 'dashed')
plt.xlabel('Hours')
plt.ylabel('Pass Probability')
plt.grid(True)
plt.title('How weights impact classification - cutoff 0.5')
plt.legend()
Out[18]:
Logistic Regression Cost/Loss Function
In [19]:
# Cost Function
z = pd.Series(np.linspace(0.000001, 0.999999, 100))
ypositive = -z.map(math.log)
ynegative = -z.map(lambda x: math.log(1-x))
In [20]:
fig = plt.figure(figsize = (12, 8))
plt.plot(z,
ypositive,
label = 'Loss curve for positive example')
plt.plot(z,
ynegative,
label = 'Loss curve for negative example')
plt.ylabel('Loss')
plt.xlabel('Predicted Probability')
plt.title('Loss Curve')
plt.legend()
Out[20]:
The cost function is a negative log curve.
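Written out, the quantity implemented in the next cell is the log (cross-entropy) loss summed over all examples, with predicted probability p_i and label y_i:

$$ J = -\sum_i \left[\, y_i \log(p_i) + (1 - y_i)\,\log(1 - p_i) \,\right] $$

Positive examples (y_i = 1) contribute -log(p_i) and negative examples (y_i = 0) contribute -log(1 - p_i), which is exactly what compute_logisitic_cost below computes.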
In [21]:
def compute_logisitic_cost(y_actual, y_predicted):
    y_pos_cost = y_predicted[y_actual == 1]
    y_neg_cost = y_predicted[y_actual == 0]
    positive_cost = (-y_pos_cost.map(math.log)).sum()
    negative_cost = -y_neg_cost.map(lambda x: math.log(1 - x)).sum()
    return positive_cost + negative_cost
In [22]:
# Example of how prediction vs actual impacts loss
# Prediction is exact opposite of actual. Loss/Cost should be very high
actual = pd.Series([1, 0, 1])
predicted = pd.Series([0.001, .9999, 0.0001])
print('Loss: {0:0.3f}'.format(compute_logisitic_cost(actual, predicted)))
In [23]:
# Prediction is close to actual. Loss/Cost should be very low
y_actual = pd.Series([1, 0, 1])
y_predicted = pd.Series([0.9, 0.1, 0.8])
print('Loss: {0:0.3f}'.format(compute_logisitic_cost(y_actual, y_predicted)))
In [24]:
# Prediction is midpoint. Loss/Cost should be high
y_actual = pd.Series([1, 0, 1])
y_predicted = pd.Series([0.5, 0.5, 0.5])
print('Loss: {0:0.3f}'.format(compute_logisitic_cost(y_actual, y_predicted)))
In [25]:
# Prediction is reasonably close to actual but less confident. Loss/Cost should be moderate
y_actual = pd.Series([1, 0, 1])
y_predicted = pd.Series([0.8, 0.4, 0.7])
print('Loss: {0:0.3f}'.format(compute_logisitic_cost(y_actual, y_predicted)))
In [26]:
weight = pd.Series(np.linspace(-1.5, 5, num = 100))
cost_at_wt = []
for w1 in weight:
    y_calculated = []
    for x in df.Hours:
        y_calculated.append(sigmoid_func(straight_line_weight(w1, x)))
    cost_at_wt.append(compute_logisitic_cost(df.Pass, pd.Series(y_calculated)))
In [27]:
fig = plt.figure(figsize = (12, 8))
plt.scatter(x = weight, y = cost_at_wt)
plt.xlabel('Weight')
plt.ylabel('Cost')
plt.grid(True)
plt.axvline(x = 1.5,
ymin = 0,
ymax = 100,
label = 'Minimal loss')
plt.axhline(y = 6.5,
xmin = 0,
xmax = 6)
plt.title('Finding optimal weights')
plt.legend()
Out[27]:
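As a small follow-up (my own sketch, not an original cell), the minimizing weight can also be read directly off the sweep from In [26]:

# Sketch: pick the weight with the lowest cost from the sweep above
best_index = int(np.argmin(cost_at_wt))
print('Weight with lowest cost: {0:0.3f} (cost {1:0.3f})'.format(weight[best_index], cost_at_wt[best_index]))
# This should land close to the 1.5046 weight quoted from the Wikipedia dataset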
A binary classifier predicts the positive-class probability of an observation.
The logistic (sigmoid) function has an important property: its output is between 0 and 1 for any input. Binary classifiers use this output as the probability of the positive class.
True Positive - Samples that are actual-positives correctly predicted as positive
True Negative - Samples that are actual-negatives correctly predicted as negative
False Negative - Samples that are actual-positives incorrectly predicted as negative
False Positive - Samples that are actual-negatives incorrectly predicted as positive
The logistic loss function is convex, with a roughly parabolic (bowl-like) shape. It not only tells us the loss at a given weight, it also tells us which way to move the weight to reduce the loss.
Gradient Descent is an optimization algorithm that uses the loss function to adjust the weights of all the features, updating them iteratively until optimal values are reached.
Batch Gradient Descent predicts the y value for all training examples and then adjusts the weights based on the total loss. It can converge much more slowly when the training set is very large. Training set order does not matter, because every example is considered before an adjustment is made.
Stochastic Gradient Descent predicts the y value for the next training example and immediately adjusts the weights.
It can converge faster when the training set is very large. The training set should be in random order, otherwise the model will not learn correctly. AWS ML uses Stochastic Gradient Descent; a minimal sketch of this update loop follows below.
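To make the stochastic update concrete, here is a minimal sketch of stochastic gradient descent for this one-feature problem. It is my own illustration, not the AWS ML implementation; the helper name sgd_logistic, the learning rate, and the epoch count are arbitrary assumptions.

# Minimal illustrative sketch (not the AWS ML implementation):
# stochastic gradient descent for one-feature logistic regression.
def sgd_logistic(hours, passed, lr = 0.05, epochs = 2000, seed = 0):
    rng = np.random.RandomState(seed)
    x = np.asarray(hours, dtype = float)
    y = np.asarray(passed, dtype = float)
    w, b = 0.0, 0.0
    for _ in range(epochs):
        for i in rng.permutation(len(x)):                # random order matters for SGD
            p = 1.0 / (1.0 + np.exp(-(w * x[i] + b)))    # predicted pass probability
            error = p - y[i]                             # gradient of the log loss w.r.t. the linear score
            w = w - lr * error * x[i]                    # adjust weight immediately
            b = b - lr * error                           # adjust bias immediately
    return w, b

# Example usage:
# w, b = sgd_logistic(df.Hours, df.Pass)
# With enough epochs, (w, b) should end up in the neighborhood of the Wikipedia values (about 1.50 and -4.08).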